##############################################################
# ESTIMATING CONSTITUENCY PREFERENCES FROM SPARSE SURVEY DATA
# USING AUXILLIARY GEOGRAPHIC INFORMATION
#
#		         -- R and WinBUGS code--
#
# July 2011
# PETER SELB, SIMON MUNZERT
# UNIVERSITY OF KONSTANZ, GERMANY
##############################################################


##############################################################
### Preliminary remark
# We owe many of the computational tricks we use in this code to Bivand et al. (2008), Gelman and Hill (2007), and Gomez-Rubio et al. (2008).


# Set working directory (supplementary data has to be stored in this folder)
setwd("C:/Uni/Research/ConstituencyPrefs/Supplementary Data")


# Load libraries
library(maptools)
library(sp)
library(spdep)
library(R2WinBUGS)
library(foreign)
library(gpclib)

##############################################################
### Create spatial data frames
# Use the readShapePoly function from package "maptools" (Lewin-Koh et al. 2010} to read in the shapefile, which contains several mandatory and optional files located in the same folder. They contain the constituency geometry and are sometimes supplemented with some attribute data (such as surface area etc.). The shapefile of the German electoral districts used here can be found at  http://www.bundeswahlleiter.de/de/bundestagswahlen/BTW_BUND_09/wahlkreiseinteilung/wahlkreisgeometrie/Geometrie_Wahlkreise_17DBT_Shape.zip (accessed at 07/29/2011)

x <- readShapePoly("Geometrie_Wahlkreise_17DBT.shp")

# Use the unionSpatialPolygons function from package "maptools" to aggregate polygons into a SpatialPolygons object,  according to a common identifier (here: WKR_NR). This step is optional in general, but mandatory in our case. Some electoral districts are made up of multiple polygons, i.e. areas which are not directly connected to each other (e.g., islands). 

gpclibPermit() # might be neccesary in order to avoid an error message in the following "unionSpatialPolygons" command
sp.districts <- unionSpatialPolygons(x, x$WKR_NR)
plot(sp.districts)

# Read in unit-level (i.e. individual level) data, containing district ID (WKR_NR), reported voting behavior (y2ch, where y equals name of party) and educational attainment (edu\_i\_1 - edu\_i\_3):

df.unit <- read.dta("vote_unit.dta")

# Read in area-level data (i.e. individual data aggregated on district level) , containing district ID, number of respondents per district n, a cumulative index variable cumn summing up the number of respondents per district, aggregated reported vote shares, mean standard errors of aggregated reported vote shares and true vote shares:

df.area2 <- read.dta("vote_area.dta")

# Extract surface area from the geodata and merge it with area-level data. Surface area as measured in square meters is rescaled by factor 1,000,000 to get square kilometers:

dist.size = as.data.frame(cbind(x$AREA/1000000, x$WKR_NR))
colnames(dist.size) <- c("area","WKR_NR")
size.area <- tapply(dist.size$area, dist.size$WKR_NR, sum)
dist.size.com <- cbind(size.area, c(1:299))
colnames(dist.size.com) <- c("area","WKR_NR")
df.area <- merge(dist.size.com, df.area2, by=c("WKR_NR"))

# Create spatial polygon data frame with function SpatialPolygonsDataFrame from package "sp" (Pebesma 2005). Thereby, external data in df.area is matched with geodata from sp.districts:

spdf.area = SpatialPolygonsDataFrame(sp.districts, df.area)


##############################################################
#Create spatial weights

# The "spdep" package (Bivand et al. 2010) provides a collection of functions to  create spatial weights matrix objects from polygon contiguities or to implement other concepts of contiguity. We use the poly2nb function, which creates a list of queen contiguity neighbors. The queen contiguity is satisfied if two regions share a common border or vertex.

nb.districts <- poly2nb(spdf.area)

# The GeoBUGS module for WinBUGS \cite{Thomas2004} comes with the intrinsic Gaussian CAR prior distribution, which has to be fed with information about the spatial structure in the following way: 1. Create a vector listing the ID numbers of the adjacent areas, 2. Create a vector listing unnormalised weights for each pair of areas, 3. Create a vector of length N (the total number of areas) giving the number of neighbors n\_i for each area, 4. Combine spatial information.

nb <- unlist(nb.districts) 
weight <- rep(1, times=length(nb))
num <- card(nb.districts)
spdata <- list("nb", "weight", "num")


##############################################################
#Simulation using WinBUGS (showcase SPD)

# Data preparation: extract dependent variable (unit level voting behavior, dichotomous for each party) "y", create inverse area variable (Inverting the surface area is coherent with the idea of approximating population density) "distsize", create object for idiosyncratic district effect (districts not covered by the survey are given values of zero, otherwise NA) "u", create district index variable "uniq" and number of districts variable "N", create index for number of non-missing districts "ns", create IDs of non-missing districts "s", generate number of observations per non-missing district "n".

y <- df.unit$spd2ch
distsize <- (df.area$area)^-1
u = ifelse(!is.na(df.area$spd2d), NA, 0)
district.name <- as.vector(df.area$WKR_NAME)
uniq <- unique(district.name)
N <- length(uniq)
ns <- length(df.area$spd2d[!is.na(df.area$spd2d)])
s <- df.area$WKR_NR[!is.na(df.area$spd2d)]
n <- df.area$n[!is.na(df.area$n)]

# Generate cumulative number of observations per non-missing district (the cumn variable has already been prepared externally and sums up the number of observations per district. Together with the index i for the i-th observation in district j and a double loop, the individual outcome can be modeled quite elegantly).

cumn <- df.area$cumn[!is.na(df.area$cumn)]

# Create data list

data1 <- list("N", "y", "ns", "s", "n", "cumn", "u")


### MODEL 1: Empty model with unstructured random effects
# We used the "R2WinBUGS" package to run WinBUGS from within R (Sturtz et al. 2005).

model1 <- function()
{
    for(i in 1:ns)  # loop over non-missing districts
    {
         for(j in 1:n[i])  # loop over individuals in district
         {
		y[cumn[i]+j] ~ dbern(p[cumn[i]+j])
		logit(p[cumn[i]+j]) <- alpha + u[s[i]]
				 }
		u[s[i]] ~ dnorm(0, tauu)  # prior density for unstructured random effects
		}

    for(i in 1:N)  # loop over all districts, computation of mu, the estimated district-specific vote share
    {
    mu[i] <- exp(alpha + u[i]) / (1 + exp(alpha + u[i]))
    }

tauu <- pow(sigmau, -2)  # prior density for inverse variance of unstructured REs
sigmau ~ dunif(0,2)

alpha ~ dflat()  # flat prior for fixed alpha
} 

write.model(model1, "model1.bug")

inits1 <- function() {list(alpha=rnorm(1), sigmau=runif(1),  u=ifelse(is.na(df.area$spd2d), NA, rnorm(N,0,1)))}

model1.sim <- bugs(data=c(data1), inits=inits1, model.file="model1.bug",
parameters.to.save=c("mu", "alpha", "sigmau", "u"), n.chains=3, n.iter=20000,
n.burnin=10000, n.thin=5, bugs.directory="C:/Program Files (x86)/WinBUGS14", debug=T) 

attach.bugs(model1.sim)
plot(model1.sim)
save.image("model1.RData")


### MODEL 2: Unstructured random effects plus district-level covariate log(distsize)

data2 <- list("N", "y", "ns", "s", "n", "cumn", "u", "distsize")

model2 <- function()
{
   for(i in 1:ns)
   {
        for(j in 1:n[i])
        {
	y[cumn[i]+j] ~ dbern(p[cumn[i]+j])
	logit(p[cumn[i]+j]) <- beta[s[i]]
					 }
	u[s[i]] ~ dnorm(0, tauu)
	}

  for(i in 1:N)
  {		
	beta[i] <- beta.0 + beta.distsize*log(distsize[i]) + u[i]
  }

  for(i in 1:N)
  {
  mu[i] <- exp(beta[i]) / (1 + exp(beta[i]))
  }

	beta.0 ~ dflat()
	beta.distsize ~ dflat()
	
	tauu <- pow(sigmau, -2)
	sigmau ~ dunif(0,2)

}

write.model(model2, "model2.bug")

inits2 <- function() {list(beta.0=rnorm(1), beta.distsize=rnorm(1), sigmau=runif(1,0,1), u=ifelse(is.na(df.area$spd2d), NA, rnorm(N,0,1)))}

model2.sim <- bugs(data=c(data2), inits=inits2, model.file="model2.bug",parameters.to.save=c("mu","beta.0", "beta.distsize", "sigmau", "u"), n.chains=3, n.iter=20000, n.burnin=10000, n.thin=5, bugs.directory="C:/Program Files (x86)/WinBUGS14", debug=T) 

attach.bugs(model2.sim)
plot(model2.sim)
save.image("model2.RData")


### MODEL 3: CAR-structured REs plus district-level covariate log(distsize\^-1)

data3 <- list("N", "y", "ns", "s", "n", "cumn", "u", "distsize")

model3 <- function()
{
   for(i in 1:ns)
   {
       for(j in 1:n[i])
   		 {
   y[cumn[i]+j] ~ dbern(p[cumn[i]+j])
   logit(p[cumn[i]+j]) <- beta[s[i]] + v[s[i]]
       }
	 u[s[i]] ~ dnorm(0, tauu)
   }

   for(i in 1:N)
   {		
	 beta[i] <- beta.0 + beta.distsize*log(distsize[i]) + u[i]
   }
		
   for(i in 1:N)
   {
   mu[i] <- exp(beta[i] + v[i]) / (1 + exp(beta[i] + v[i]))
   }

   v[1:N] ~ car.normal(nb[], weight[], num[], tauv) # impose CAR distribution on CAR-structured REs

    beta.0 ~ dflat()
    beta.distsize ~ dflat()

	tauu <- pow(sigmau, -2)
	sigmau ~ dunif(0,2)
	tauv <- pow(sigmav, -2)
	sigmav ~ dunif(0,2)
}


write.model(model3, "model3.bug")

inits3 <- function() {list(beta.0=rnorm(1), beta.distsize=rnorm(1), sigmau=runif(1,0,1), sigmav=runif(1,0,1), u=ifelse(is.na(df.area$spd2d), NA, rnorm(N,0,1)), v=rnorm(N,0,1))}

model3.sim <- bugs(data=c(data3, spdata), inits=inits3,
model.file="model3.bug", parameters.to.save=c("mu", "beta.0", "beta.distsize", "sigmau", "sigmav", "u", "v"), n.chains=3, n.iter=20000, n.burnin=10000, n.thin=10, bugs.directory="C:/Program Files (x86)/WinBUGS14", debug=T) 

attach.bugs(model3.sim)

plot(model3.sim)

save.image("model3.RData")


### MODEL 3p: Combination with poststratification approach:  CAR-structured REs plus district-level covariate log(distsize) plus individual level eduction dummies

# Extract dependent variable (unit level voting behavior for the Greens)

y <- df.unit$gru2ch
distsize <- (df.area$area)^-1
u = ifelse(!is.na(df.area$gru2d), NA, 0)
district.name <- as.vector(df.area$WKR_NAME)
uniq <- unique(district.name)
N <- length(uniq)
ns <- length(df.area$spd2d[!is.na(df.area$gru2d)])
s <- df.area$WKR_NR[!is.na(df.area$gru2d)]
n <- df.area$n[!is.na(df.area$n)]
cumn <- df.area$cumn[!is.na(df.area$cumn)]

# Extract poststratification variables (educational levels; level one is redundant)

edu2 <- df.unit$edu_i_2
edu3 <- df.unit$edu_i_3
edu2[is.na(edu2)] <- 0
edu3[is.na(edu3)] <- 0

# Create object for idiosyncratic district effect (districts not covered by the survey are given values of zero, otherwise NA)

dir.mu <-  df.area$gru2d
u = ifelse(!is.na(dir.mu), NA, 0)

# Set up model

data3 <- list("N", "y", "ns", "s", "n", "cumn", "u", "distsize", "edu2", "edu3")

model3 <- function()
{
   for(i in 1:ns)
   {
        for(j in 1:n[i])
        {
     y[cumn[i]+j] ~ dbern(p[cumn[i]+j])
     logit(p[cumn[i]+j]) <- beta[s[i]] + gamma2*edu2[cumn[i]+j] + gamma3*edu3[cumn[i]+j]  + v[s[i]]
        }
	 u[s[i]] ~ dnorm(0, tauu)
   }

   for(i in 1:N)
   {		
	 beta[i] <- beta.0 + beta.distsize*log(distsize[i]) + u[i]
   }
		
  v[1:N] ~ car.normal(nb[], weight[], num[], tauv)

  beta.0 ~ dflat()
  beta.distsize ~ dflat()
  gamma2 ~ dflat()
  gamma3 ~ dflat()

  tauu <- pow(sigmau, -2)
  sigmau ~ dunif(0,2)
  tauv <- pow(sigmav, -2)
  sigmav ~ dunif(0,2)
}


write.model(model3, "model3.bug")

inits3 <- function() {list(beta.0=rnorm(1), beta.distsize=rnorm(1), gamma2=rnorm(1), gamma3=rnorm(1), sigmau=runif(1), sigmav=runif(1), u=ifelse(is.na(dir.mu), NA, rnorm(N,0,1)), v=rnorm(N,0,1))}

model3.sim <- bugs(data=c(data3, spdata), inits=inits3, model.file="model3.bug", parameters.to.save=c("beta.0", "beta", "gamma2", "gamma3", "sigmau", "sigmav", "u", "v"), n.chains=3, n.iter=30000, n.burnin=20000, n.thin=10, bugs.directory="C:/Program Files (x86)/WinBUGS14", debug=F)

attach.bugs(model3.sim)
plot(model3.sim)

save.image("model3.RData")

### Poststratification

# Get sheet with poststratification cells and attach districts size variable:

df.post <- read.dta("edu.dta")


# Fill n.sims x n.cells sheet with predictions
attach.bugs(model3.sim)
L <- nrow(df.post)
mu.l <- array(NA,c(n.sims,L))
for (l in 1:L){
mu.l[,l] <- inv.logit(beta[l] + gamma2*df.post$EDU2[l] + gamma3*df.post$EDU3[l] + v[,df.post$WKR_NR[l]])
}

# Weight by population shares and collapse to district level:

mu.w <- mu.l*df.post$EDU
mu.j <- array(NA, c(n.sims, N))
for (i in 1:n.sims) {
mu.j[i,] <- tapply(mu.w[i,], df.post$WKR_NR, sum)
}

# Median SAEs plus 90\%-CIs:

carpost.mu <- 0
carpost.sd <- 0
carpost.90lo <- 0
carpost.90hi <- 0
for (i in 1:N){
carpost.mu[i] <- median(mu.j[,i])
carpost.90lo[i] <- quantile(mu.j[,i],probs=c(.05))
carpost.90hi[i] <- quantile(mu.j[,i],probs=c(.95))
}
detach.bugs() # Detach simulation data




##############################################################
### Literature


# Bivand, R. S., E. J. Pebesma and V. Gomez-Rubio. 2008. Applied Spatial Data Analysis with R. Springer.

# Bivand, Roger, with contributions by Micah Altman, Luc Anselin, Renato Assun�c�ao, Olaf Berke, Andrew Bernat, Eric Blankmeyer, Marilia Carvalho, Yongwan Chun, Bjarke Christensen, Carsten Dormann, Stephane Dray, Rein Halbersma, Elias Krainski, Nicholas Lewin-Koh, Hongfei Li, Jielai Ma, Giovanni Millo, Werner Mueller, Hisaji Ono, Pedro Peres-Neto, Gianfranco Piras, Markus Reder, Michael Tiefelsdorf, and Danlin Yu. 2010. spdep: Spatial dependence: weighting schemes, statistics and models. R package version 0.5-11. urlhttp://CRAN.R-project.org/package=spdep

# Gelman, Andrew and Jennifer Hill. 2007. Data Analysis Using Regression and Multilevel/Hierarchical Models. Cambridge: Cambridge University Press.

# Lewin-Koh, Nicholas J., Roger Bivand, contributions by Edzer J. Pebesma, Eric Archer, Adrian Baddeley, Hans-Joerg Bibiko, Stephane Dray, David Forrest, Michael Friendly, Patrick Giraudoux, Duncan Golicher, Virgilio Gomez-Rubio, Patrick Hausmann, Thomas Jagger, Sebastian P. Luque, Don Mac-Queen, Andrew Niccolai, Tom Short and Ben Stabler. 2010. maptools: Tools for reading and handling spatial objects. R package version 0.7-34.

# Pebesma, E.J., R.S. Bivand. 2005. sp: Classes and methods for spatial data in R. R package version 0.9-69. http://cran.r-project.org/web/packages/sp

# Gomez-Rubio, V., N. Best, S. Richardson, G. Li and P. Clarke. 2008. Bayesian Statistics SmallArea Estimation. Working paper. http://www.bias-project.org.uk/papers/BayesianSAE.pdf

# Sturtz, Sibylle, Uwe Ligges and Andrew Gelman. 2005. "R2WinBUGS: A Package for Running WinBUGS from R." Journal of Statistical Software 12:1�16. http://www.jstatsoft.org

# Thomas, Andrew, Nicky Best, Dave Lunn, Richard Arnold and David Spiegelhalter. 2004. GeoBUGS User Manual. 1.2 ed.